Exploratory Data Analysis and Feature Extraction¶


Installing Packages and Libraries¶

In [ ]:
# Uncomment to upgrade packages
#!pip3 install pandas --user --upgrade --quiet
#!pip3 install scipy --user --upgrade --quiet
#!pip3 install numpy --user --upgrade --quiet
#!pip3 install statsmodels --user --upgrade --quiet
#!pip3 install seaborn --user --upgrade --quiet
In [ ]:
#%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import pandas as pd
from collections import Counter
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
In [ ]:
# extra imports
from pandas import read_csv
from scipy.stats import boxcox, shapiro, chi2, chi2_contingency
from matplotlib import patches
import seaborn as sns

Defining Functions¶

In [ ]:
def print_categorical_variables(df):
    categorical_vars = df.select_dtypes(include=['object', 'category']).columns
    print("Categorical Variables:")
    for var in categorical_vars:
        categories = df[var].unique().tolist()
        print(f"{var}: {', '.join(categories)}")

def print_numerical_variables(df):
    numerical_vars = df.select_dtypes(exclude=['object', 'category']).columns
    print("Numerical Variables:")
    for var in numerical_vars:
        print(var)

def split_cat_num_columns(df):
    #Split for further analysis between categorical and numerical variables
    cat_cols = []
    num_cols = []

    for col in df.columns:
        if df[col].dtype in ['object', 'category']:
            cat_cols.append(col)
        else:
            num_cols.append(col)
    return cat_cols, num_cols

def plot_dataframe(df, x=4, y =4):
    
    cat_cols, _ = split_cat_num_columns(df)
    
    # Define custom color palette
    colors = ['#648E9C', '#9CB1BC', '#C5D4DE', '#E8F1F4']

    # Create figure and axes
    fig, axes = plt.subplots(x, y, figsize=(18, 16))

    # Set axis labels font properties
    font_props = {'fontsize': 10}

    # Plot each variable
    for i, col in enumerate(df.columns):
        ax = axes.reshape(-1)[i]
        if col in cat_cols:
            sns.countplot(x=col, data=df, ax=ax, palette=colors)
            ax.set_xlabel(col, fontdict=font_props)
            ax.set_ylabel('Count', fontdict=font_props)

            # Add numbers on top of the bars
            for p in ax.patches:
                ax.annotate(f'{p.get_height()}', (p.get_x() + p.get_width() / 2., p.get_height()),
                            ha='center', va='bottom', xytext=(0, 5), textcoords='offset points', fontsize=8)
        else:
            sns.histplot(x=col, data=df, ax=ax, color=colors[0], kde=True, stat="density")
            kde_color = '#9C648E'  # Desired color for the Gaussian curves
            sns.kdeplot(x=col, data=df, ax=ax, color=kde_color, lw=1.5)
            ax.set_xlabel(col, fontdict=font_props)
            ax.set_ylabel('Density', fontdict=font_props)

        # Rotate x-axis labels
        ax.tick_params(axis='x', labelrotation=45)
        ax.tick_params(axis='both', labelsize=8)

        # Adjust y-axis limit to leave space for the numbers
        ax.set_ylim(0, ax.get_ylim()[1] * 1.15)

    # Adjust spacing between subplots
    plt.tight_layout(pad=2.0)

    # Display the plot without the messages
    plt.show();
    
def calculate_outliers(data, column):
    q1 = np.percentile(data[column], 25)
    q3 = np.percentile(data[column], 75)
    iqr = q3 - q1

    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    outliers = data[column][(data[column] < lower_bound) | (data[column] > upper_bound)]
    extreme_outliers = data[column][(data[column] < lower_bound - 3 * iqr) | (data[column] > upper_bound + 3 * iqr)]

    return q1, q3, iqr, lower_bound, upper_bound, outliers, extreme_outliers

def plot_boxplot_histogram(data, column):
    # Define custom color palette
    colors = ['#648E9C', '#9CB1BC', '#C5D4DE', '#E8F1F4']

    # Calculate outliers and extreme outliers
    q1, q3, iqr, lower_bound, upper_bound, outliers, extreme_outliers = calculate_outliers(data, column)

    # Create subplots with custom width ratios and figure size
    fig, axes = plt.subplots(1, 2, gridspec_kw={'width_ratios': [1, 4]}, figsize=(9, 5))

    # Box plot
    boxplot = axes[0].boxplot(data[column], showfliers=True)
    axes[0].set_title(f'{column} - Box Plot', fontsize=12)
    axes[0].set_ylabel(f'{column}', fontsize=10)
    
    # Plot the boundary lines for outliers in the box plot if they exist
    if outliers.any():
        axes[0].axhline(lower_bound, color='#9C648E', linestyle='--')
        axes[0].axhline(upper_bound, color='#9C648E', linestyle='--')
    
    # Plot the boundary lines for extreme outliers in the box plot if they exist
    if extreme_outliers.any():
        axes[0].axhline(lower_bound - 3 * iqr, color='#3F51B5', linestyle='--')
        axes[0].axhline(upper_bound + 3 * iqr, color='#3F51B5', linestyle='--')

    # Change color of extreme outliers to blue
    for flier in boxplot['fliers']:
        flier.set(marker='o', color='#3F51B5', alpha=0.5)

    # Histogram
    hist = sns.histplot(data=data, x=column, ax=axes[1], color=colors[0])
    axes[1].set_title(f'{column} - Histogram', fontsize=12)
    axes[1].set_xlabel(f'{column}', fontsize=10)
    axes[1].set_ylabel('Frequency', fontsize=10)

    # Plot the boundary lines for outliers in the histogram if they exist
    if outliers.any():
        hist.axvline(lower_bound, color='#9C648E', linestyle='--')
        hist.axvline(upper_bound, color='#9C648E', linestyle='--')
    
    # Plot the boundary lines for extreme outliers in the histogram if they exist
    if extreme_outliers.any():
        hist.axvline(lower_bound - 3 * iqr, color='#3F51B5', linestyle='--')
        hist.axvline(upper_bound + 3 * iqr, color='#3F51B5', linestyle='--')

    # Create legend for the plot
    legend_elements = [
        plt.Line2D([0], [0], marker='o', color='#9C648E', linestyle='--', markersize=5, label='Outlier Boundary'),
        plt.Line2D([0], [0], marker='o', color='#3F51B5', linestyle='--', markersize=5, alpha=0.5, label='Extreme Outlier')
    ]
    axes[1].legend(handles=legend_elements, loc='upper right')

    # Adjust tick label font size for both subplots
    for ax in axes:
        ax.tick_params(axis='both', labelsize=8)

    # Adjust spacing between subplots
    plt.tight_layout()

    # Display the plot
    plt.show();

def print_outlier_analysis(column, q1, q3, iqr, lower_bound, upper_bound, outliers, extreme_outliers):
    print("{} Outlier Analysis:".format(column))
    print("-----------------------------")
    print("First Quartile (Q1): {:.2f}".format(q1))
    print("Third Quartile (Q3): {:.2f}".format(q3))
    print("Interquartile Range (IQR): {:.2f}".format(iqr))
    print("Lower Bound: {:.2f}".format(lower_bound))
    print("Upper Bound: {:.2f}".format(upper_bound))
    print(f"Outliers Length: {len(outliers)}")
    print(f"Extreme Outliers Length: {len(extreme_outliers)}")
    
def bivariate_numerical_exploratory_analysis(df, target):
    plt.rcParams['font.size']= 10
    colors = ['#648E9C', '#9C648E']
    sns.pairplot(df, hue=target, palette=colors, plot_kws={'alpha': 0.75});

def plot_categorical_variables(data, target):
    categorical_vars = data.select_dtypes(include=['object', 'category']).columns

    num_plots = len(categorical_vars)
    num_cols = 3  # Number of columns in the subplot grid
    num_rows = (num_plots - 1) // num_cols + 1

    fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 5*num_rows))
    axes = axes.flatten()
    for i, var in enumerate(categorical_vars):
        sns.countplot(x=var, hue=target, data=data, ax=axes[i], palette=["#648E9C", "#9C648E"])
        axes[i].set_title(f'{var} vs {target}')
        axes[i].set_xlabel(var)
        axes[i].set_ylabel('Count')
        axes[i].set_xticklabels(axes[i].get_xticklabels(), rotation=45)
        axes[i].legend(title=target)

    # Hide empty subplots
    for j in range(i + 1, num_plots):
        fig.delaxes(axes[j])

    plt.tight_layout()
    plt.show();

def correlation_heatmap(df):
    numeric_cols = df.select_dtypes(include='number')  # Select only numeric columns
    correlation = numeric_cols.corr()
    mask = np.triu(np.ones_like(correlation, dtype=bool))

    cmap = sns.color_palette(['#648E9C', '#9C648E'])
    plt.figure(figsize=(8, 6))
    sns.heatmap(correlation, annot=True, cmap=cmap, linewidths=0.5)
    plt.title('Correlation Matrix')
    # Rotate variable labels
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=45, ha='right')
    plt.show();

def generate_cross_tabulations(df, target_var):
    categorical_vars = df.select_dtypes(include=['object', 'category']).columns
    
    for var in categorical_vars:
        crosstab = pd.crosstab(df[var], df[target_var])
        row_sums = crosstab.sum(axis=0)
        col_sums = crosstab.sum(axis=1)
        
        print(f"Cross-tabulation for {var} and {target_var}:")
        print(crosstab)
        print("Row sums:")
        print(row_sums)
        print("Column sums:")
        print(col_sums)
        print("\n")

def perform_chi_squared_test(df, cat_cols):
    results = []
    
    for col1 in cat_cols:
        for col2 in cat_cols:
            if col1 != col2:
                contingency_table = pd.crosstab(df[col1], df[col2])
                chi2, p_value, _, _ = chi2_contingency(contingency_table)
                results.append((col1, col2, chi2, p_value))
    
    results_df = pd.DataFrame(results, columns=['Variable 1', 'Variable 2', 'Chi-square', 'P-value'])
    return results_df

SECTION 1: Reading the Dataset Breast_Cancer.csv¶

In this first part we are going to load the dataset, explore it and get some first insights.

In [ ]:
breast_cancer = read_csv("./Breast_Cancer.csv", header=0, delimiter=',')
breast_cancer.shape
Out[ ]:
(4024, 16)

As we can see, the dataset has 4024 observations and 16 columns (variables). Our target value is the 16th, which is the Status of the patient and it can be binary (Dead or Alive).

In [ ]:
breast_cancer.columns
Out[ ]:
Index(['Age', 'Race', 'Marital Status', 'T Stage ', 'N Stage', '6th Stage',
       'differentiate', 'Grade', 'A Stage', 'Tumor Size', 'Estrogen Status',
       'Progesterone Status', 'Regional Node Examined',
       'Reginol Node Positive', 'Survival Months', 'Status'],
      dtype='object')
In [ ]:
breast_cancer.rename(columns={"Reginol Node Positive" : "Regional Node Positive"}, inplace=True) # Fixing typo of the column name

Below we can see the first 5 rows of the dataset in order to have a brief overview of the available data.

In [ ]:
breast_cancer.head()
Out[ ]:
Age Race Marital Status T Stage N Stage 6th Stage differentiate Grade A Stage Tumor Size Estrogen Status Progesterone Status Regional Node Examined Regional Node Positive Survival Months Status
0 68 White Married T1 N1 IIA Poorly differentiated 3 Regional 4 Positive Positive 24 1 60 Alive
1 50 White Married T2 N2 IIIA Moderately differentiated 2 Regional 35 Positive Positive 14 5 62 Alive
2 58 White Divorced T3 N3 IIIC Moderately differentiated 2 Regional 63 Positive Positive 14 7 75 Alive
3 58 White Married T1 N1 IIA Poorly differentiated 3 Regional 18 Positive Positive 2 1 84 Alive
4 47 White Married T2 N1 IIB Poorly differentiated 3 Regional 41 Positive Positive 3 1 50 Alive

SECTION 2: Basic Inspection of the Dataset¶

To begin with, a quick look at the variables types is taking place.

In [ ]:
breast_cancer.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4024 entries, 0 to 4023
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Age                     4024 non-null   int64 
 1   Race                    4024 non-null   object
 2   Marital Status          4024 non-null   object
 3   T Stage                 4024 non-null   object
 4   N Stage                 4024 non-null   object
 5   6th Stage               4024 non-null   object
 6   differentiate           4024 non-null   object
 7   Grade                   4024 non-null   object
 8   A Stage                 4024 non-null   object
 9   Tumor Size              4024 non-null   int64 
 10  Estrogen Status         4024 non-null   object
 11  Progesterone Status     4024 non-null   object
 12  Regional Node Examined  4024 non-null   int64 
 13  Regional Node Positive  4024 non-null   int64 
 14  Survival Months         4024 non-null   int64 
 15  Status                  4024 non-null   object
dtypes: int64(5), object(11)
memory usage: 503.1+ KB

As it was mentioned in the dataset's metadata, there are in total 16 variables, 5 of which are numerical and 11 categorical. Also, as it can be seen in the results of the previous command, there are not null values (missing values) included in the dataset. However, we need to take a look at the distribution of the numerical variables, as well as the categories of the categorical variables in order to be sure that the dataset does not contain erroneous data.

In [ ]:
breast_cancer.describe()
Out[ ]:
Age Tumor Size Regional Node Examined Regional Node Positive Survival Months
count 4024.000000 4024.000000 4024.000000 4024.000000 4024.000000
mean 53.972167 30.473658 14.357107 4.158052 71.297962
std 8.963134 21.119696 8.099675 5.109331 22.921430
min 30.000000 1.000000 1.000000 1.000000 1.000000
25% 47.000000 16.000000 9.000000 1.000000 56.000000
50% 54.000000 25.000000 14.000000 2.000000 73.000000
75% 61.000000 38.000000 19.000000 5.000000 90.000000
max 69.000000 140.000000 61.000000 46.000000 107.000000

The results here indicate that the numerical variables do not include any abnormally high maximums (e.g. 9999999), thus we can consider that the numerical variables luck of errors. Although, from the statistics calculated one can understand that there are some outliers in some of the variables. For example in variable Regional Node Positive, the value 46 is extremely far from the rest of the distribution. Outlier analysis is performed in the following sections.

As for the categorical variables by executing the following command, one can take a look at the available categories of the variables.

In [ ]:
print_categorical_variables(breast_cancer)
Categorical Variables:
Race: White, Black, Other
Marital Status: Married, Divorced, Single , Widowed, Separated
T Stage : T1, T2, T3, T4
N Stage: N1, N2, N3
6th Stage: IIA, IIIA, IIIC, IIB, IIIB
differentiate: Poorly differentiated, Moderately differentiated, Well differentiated, Undifferentiated
Grade: 3, 2, 1,  anaplastic; Grade IV
A Stage: Regional, Distant
Estrogen Status: Positive, Negative
Progesterone Status: Positive, Negative
Status: Alive, Dead

From the result, it can be concluded that the categorical variables, do not contain erroneous categories. Thus, missing or erroneous data is not present in this dataset. In case of missing or erroneous data in the dataset, imputation techniques need to be considered. However, if the percentage of erroneous or missing data for a specific variable is large enough (e.g. 70%) then the best solution would be to not consider it part of the analysis.

Just to be sure, the dropna command is being used, but it can be seen that not a single row is discarded, as the shape of the dataset remains the same as the origin one.

In [ ]:
breast_cancer.dropna(inplace=True)
breast_cancer.shape
Out[ ]:
(4024, 16)

Finally, a brief description of the available variables is taking place:

  • Numerical Variables:

    • Age: Age of the patient
    • Tumor size: Size of the tumor in millimeters
    • Regional Node Examined: Number of lymph nodes examined
    • Regional Node Positive: How many lymph nodes examined were detected having cancer cells.
    • Survival Months: Months since the diagnosis
  • Categorical Variables:

    • Race: Patient’s race. It can be ‘White’, ‘Black’ or ‘Other’ (American Indian/AK Native, Asian/Pacific Islander)
    • Marital Status: Patient’s marital status. It can be ‘Divorced’, ‘Married’, ‘Separated’, ‘Single’ or ‘Widowed’
    • T Stage: Refers to the size of the primary tumor. It can be ‘T1’, ‘T2’, ‘T3’ or ‘T4’
    • N Stage: Refers to the extent of lymph node involvement. It can be ‘N1’, ‘N2’ or ‘N3’
    • 6th Stage: Refers to the spread of the cancer cells and the size of the tumor. It can be ‘IIA’, ‘IIB’, ‘IIIA’, ‘IIIB’ or ‘IIIC’
    • differentiate: Refers to the identification of the tumor. It can be ‘Moderately differentiated’, ‘Poorly differentiated’, ‘Undifferentiated’ or ‘Well differentiated’
    • Grade: Determines the aggressiveness of the cancer and its likelihood to spread. It can be ‘1’, ‘2’, ‘3’ or ‘anaplastic;GradeIV’. This variable may need future labeling preprocessing.
    • A Stage: Refers to the early stage and whether it has spread or not. It can be ‘Distant’ or ‘Regional’
    • Estrogen Status: Indicates whether the cancer cells can have estrogen receptors. It can be ‘Positive’ or ‘Negative’
    • Progesterone Status: Indicates whether the cancer cells can have progesterone receptors. It can be ‘Positive’ or ‘Negative’
    • Status (Target Variable): Indicates patient’s status. It can be ‘Dead’ or ‘Alive’.

SECTION 3: Univariate Analysis¶

To begin with, we are deviding into two separate lists the column names of the numerical and categorical variables, respectively.

In [ ]:
print_categorical_variables(breast_cancer)
print("\n")
print_numerical_variables(breast_cancer)
Categorical Variables:
Race: White, Black, Other
Marital Status: Married, Divorced, Single , Widowed, Separated
T Stage : T1, T2, T3, T4
N Stage: N1, N2, N3
6th Stage: IIA, IIIA, IIIC, IIB, IIIB
differentiate: Poorly differentiated, Moderately differentiated, Well differentiated, Undifferentiated
Grade: 3, 2, 1,  anaplastic; Grade IV
A Stage: Regional, Distant
Estrogen Status: Positive, Negative
Progesterone Status: Positive, Negative
Status: Alive, Dead


Numerical Variables:
Age
Tumor Size
Regional Node Examined
Regional Node Positive
Survival Months

Once the variables have been separated into categorical and numerical, it is possible to create a single plot containing all the histograms of the numerical variables and the countplots of the categorical variables. With the following visualisation, an initial overview of the distribution of the available variables is possible.

In [ ]:
plot_dataframe(breast_cancer)

After analyzing the figures above, we can conclude that almost all of the variables have an unbalanced distribution. More speciffically:

Categorical Variables:

  • Race: Most of the women in the dataset are White leaving the other categories of those variables with comparable less observations.
  • Marital Status: Again, most of the women in the dataset are Married and the difference between the groups is huge (2643 married compared to 45 separated).
  • T Stage: Most of the available data refer to cases of T1 and T2, while T3 and T4 are less.
  • N Stage: Dame is true here for the category N1.
  • 6th Stage: In this case only the category IIIB lacks observations.
  • differentiate: The four categories are totally unbalanced, while the undifferentiate one contains only 19 out of 4024.
  • Grade : The same is true here for the category anaplastic; GradeIV.
  • A Stage: Distant alues are only 92 out of 4024.
  • Estrogen Status: Same is true here for category Negative.
  • Progesterone Status: Here as well for Negative values.
  • Status (Target Variable): The distribution is unbalacned for the target variable as well, since we obtain 616observations for Dead cases and 3408 for Alive.

Numerical Variables:

  • Age: The distribution of age is quite fine, since there are several observation for mostly all the ages between 30 and 70 years old.
  • Tumor Size: The distribtuion of the tumor sizes is left-skewed, meaning that there are some values that take small values, but at the same time some values are considered too high. This will lead to outliers and they need to be treated correctly.
  • Regional Node Examined: In this case the distribution is quite fine, only some extre cases occur for very small values of the variable, and some outliers exist in the high range of x-axis.
  • Regional Node Positive: This variable is hardly left-skewed, leading to the conlcusion that most of the observations have very small numbers for this variable, and a few include very high values.
  • Survival Months: It seems a bit right-skewed, thus observations with small numbers of survival months (0 to 40) need to be treated appropriately.

The comments mentioned here need to be addressed in order to avoid fitting a learning algorithm which will generate a model described by high bias. For this reason, below the Outliers Analysis is included.

Before moving on to the Outliers Analysis, Sharipo-normality test is performed on the numerical values. In that way it can be statistically proven which numerical variables follow a normal distribution. Later in the Bivariate Exploratory Analysis, Chi-Squared tests are performed on the categorical variables as well, in order to check correlation between them.

Shapiro Normality Test¶

In [ ]:
cat_cols, num_cols = split_cat_num_columns(breast_cancer)
#Check the normality
shapiro_results = []
for column in num_cols:
    stat, p_value = shapiro(breast_cancer[column])
    shapiro_results.append((column, stat, p_value))

shapiro_df = pd.DataFrame(shapiro_results, columns=['Variable', 'Statistic', 'P_Value'])
print(shapiro_df)
                 Variable  Statistic       P_Value
0                     Age   0.975857  1.395776e-25
1              Tumor Size   0.841386  0.000000e+00
2  Regional Node Examined   0.959947  5.811554e-32
3  Regional Node Positive   0.653167  0.000000e+00
4         Survival Months   0.962401  4.003365e-31

The results of the Shapiro-Wilk normality test reveal that none of the numerical variables (Age, Tumor Size, Regional Node Examined, Regional Node Positive, and Survival Months) exhibit a normal distribution, as indicated by the extremely small p-values obtained. This suggests that these variables deviate significantly from a normal distribution. To mitigate potential issues during the modeling phase, it is considered to appy suitable transformations or utilizing non-parametric models that do not rely on normality assumptions.

However, taking into account our numerical variables and by looking at the generated figures, it looks like Tumor Size could be transformed into a normal distribution with the usage of logarithmic transformation or the boxcox function. Here are the results:

In [ ]:
breast_cancer['Tumor Size'].apply(np.log10).hist(bins='auto',figsize=(8,8), color='#648E9C',alpha=0.7, edgecolor='black');
In [ ]:
# Apply Box-Cox transformation to 'Tumor Size'
transformed_tumor_size, lambda_ = boxcox(breast_cancer['Tumor Size'])

# Create a new figure with the desired size
plt.figure(figsize=(8, 8))

# Plot the histogram of transformed 'Tumor Size'
plt.hist(transformed_tumor_size, bins='auto', color='#648E9C', alpha=0.7, edgecolor='black')
plt.xlabel('Transformed Tumor Size')
plt.ylabel('Frequency')
plt.title('Histogram of Transformed Tumor Size')
plt.show();
In [ ]:
lambda_
Out[ ]:
0.10735796756858493

Since the lambda variable is equal to 0.10 it means that the transformation needed to be applied to Tumor Size is the cube root, meaning boxcox is prefered compared to logarithmic transformation. Although, the outliers analysis and the extraction of new features will take place before we move on to preprocessing and normalizing the dataset. This decision is taken in order to check the correctness of the data, but in order to devide the preprocessing and normalization of the training and test data. In that way, we can be sure about the results of the model on the test data that would indicate if the model is able to generalize well or not, in new unseen cases.

Univariate Outliers Analysis¶

During the development and training of learning algorithms the inclusion of outliers can result in high bias results, thus it is very important to treat them carefully, first by identifying them and finally impute, remove, or just acknowledge their existence. The analysis is completed for all the numerical variables of the dataset. Firstly, Tumor Size is being analyzed.

In [ ]:
plot_boxplot_histogram(breast_cancer, 'Tumor Size')
In [ ]:
ts_q1, ts_q3, ts_iqr, ts_lower_bound, ts_upper_bound, ts_outliers, ts_extreme_outliers = calculate_outliers(breast_cancer, 'Tumor Size')
print_outlier_analysis('Tumor Size', ts_q1, ts_q3, ts_iqr, ts_lower_bound, ts_upper_bound, ts_outliers, ts_extreme_outliers)
Tumor Size Outlier Analysis:
-----------------------------
First Quartile (Q1): 16.00
Third Quartile (Q3): 38.00
Interquartile Range (IQR): 22.00
Lower Bound: -17.00
Upper Bound: 71.00
Outliers Length: 222
Extreme Outliers Length: 5
In [ ]:
#ts_extreme_outliers
breast_cancer.loc[ts_extreme_outliers.index]
Out[ ]:
Age Race Marital Status T Stage N Stage 6th Stage differentiate Grade A Stage Tumor Size Estrogen Status Progesterone Status Regional Node Examined Regional Node Positive Survival Months Status
289 41 White Married T3 N3 IIIC Poorly differentiated 3 Regional 140 Positive Positive 41 15 51 Dead
740 49 White Married T3 N1 IIIA Moderately differentiated 2 Regional 140 Positive Positive 14 2 48 Alive
1007 60 White Divorced T3 N2 IIIA Moderately differentiated 2 Regional 140 Positive Positive 21 5 57 Alive
1512 63 White Married T4 N2 IIIB Moderately differentiated 2 Regional 140 Positive Positive 9 8 89 Alive
3965 47 White Married T3 N2 IIIA Well differentiated 1 Regional 140 Positive Positive 23 7 64 Alive
In [ ]:
extreme_outliers_tumor_size = breast_cancer.loc[ts_extreme_outliers.index].copy().reset_index()
breast_cancer.drop(ts_extreme_outliers.index, inplace=True)
breast_cancer.reset_index(drop=True, inplace=True)
In [ ]:
breast_cancer.shape
Out[ ]:
(4019, 16)

So here for the variable Tumor Size we can see that there 222 outliers in total (starting from values of 71), from which 5 are considered extreme outliers (having size equal to 140). By taking a look at the observations of extreme outlliers for Tumor Size, we consider those cases as extreme, thus they are going to be separated from the main dataset. Additionaly analysis about those cases will be completed during the modelling part of the project. The analysis is continued by including Regional Node Examined.

In [ ]:
plot_boxplot_histogram(breast_cancer, 'Regional Node Examined')
In [ ]:
rne_q1, rne_q3, rne_iqr, rne_lower_bound, rne_upper_bound, rne_outliers, rne_extreme_outliers = calculate_outliers(breast_cancer, 'Regional Node Examined')
print_outlier_analysis('Regional Node Examined', rne_q1, rne_q3, rne_iqr, rne_lower_bound, rne_upper_bound, rne_outliers, rne_extreme_outliers)
Regional Node Examined Outlier Analysis:
-----------------------------
First Quartile (Q1): 9.00
Third Quartile (Q3): 19.00
Interquartile Range (IQR): 10.00
Lower Bound: -6.00
Upper Bound: 34.00
Outliers Length: 71
Extreme Outliers Length: 0
In [ ]:
rne_outliers.describe()
Out[ ]:
count    71.000000
mean     41.211268
std       6.313060
min      35.000000
25%      36.000000
50%      39.000000
75%      44.500000
max      61.000000
Name: Regional Node Examined, dtype: float64

For Regional Node Examined it can be seen that there are not any extreme outliers, while only 72 values are considered as outliers, but as the values seem logical we are keeping all the information in the dataset. Let's move on to Regional Node Positive.

In [ ]:
plot_boxplot_histogram(breast_cancer, 'Regional Node Positive')
In [ ]:
rnp_q1, rnp_q3, rnp_iqr, rnp_lower_bound, rnp_upper_bound, rnp_outliers, rnp_extreme_outliers = calculate_outliers(breast_cancer, 'Regional Node Positive')
print_outlier_analysis('Regional Node Positive', rnp_q1, rnp_q3, rnp_iqr, rnp_lower_bound, rnp_upper_bound, rnp_outliers, rnp_extreme_outliers)
Regional Node Positive Outlier Analysis:
-----------------------------
First Quartile (Q1): 1.00
Third Quartile (Q3): 5.00
Interquartile Range (IQR): 4.00
Lower Bound: -5.00
Upper Bound: 11.00
Outliers Length: 343
Extreme Outliers Length: 54
In [ ]:
rnp_extreme_outliers.describe()
Out[ ]:
count    54.000000
mean     28.592593
std       4.478538
min      24.000000
25%      26.000000
50%      27.500000
75%      29.750000
max      46.000000
Name: Regional Node Positive, dtype: float64

For Regional Node Positive there are 54 extreme outliers (with values more than 23), while 344 values are considered as outliers. By checking the extreme outliers, we can conlude that those extreme cases, are not so extreme afterall, so we will consider them for the training of the model. Lastly Survival Months is analyzed.

In [ ]:
plot_boxplot_histogram(breast_cancer, 'Survival Months')
In [ ]:
sm_q1, sm_q3, sm_iqr, sm_lower_bound, sm_upper_bound, sm_outliers, sm_extreme_outliers = calculate_outliers(breast_cancer, 'Survival Months')
print_outlier_analysis('Survival Months', sm_q1, sm_q3, sm_iqr, sm_lower_bound, sm_upper_bound, sm_outliers, sm_extreme_outliers)
Survival Months Outlier Analysis:
-----------------------------
First Quartile (Q1): 56.00
Third Quartile (Q3): 90.00
Interquartile Range (IQR): 34.00
Lower Bound: 5.00
Upper Bound: 141.00
Outliers Length: 18
Extreme Outliers Length: 0
In [ ]:
breast_cancer.loc[sm_outliers.index]
Out[ ]:
Age Race Marital Status T Stage N Stage 6th Stage differentiate Grade A Stage Tumor Size Estrogen Status Progesterone Status Regional Node Examined Regional Node Positive Survival Months Status
413 55 White Married T1 N1 IIA Moderately differentiated 2 Regional 15 Positive Positive 9 1 3 Alive
678 62 White Married T2 N2 IIIA Moderately differentiated 2 Regional 25 Positive Positive 13 4 4 Dead
720 49 White Married T2 N3 IIIC Moderately differentiated 2 Regional 32 Positive Positive 20 11 3 Alive
894 67 White Married T3 N2 IIIA Poorly differentiated 3 Regional 55 Positive Positive 9 9 4 Dead
919 43 Other Married T2 N3 IIIC Moderately differentiated 2 Regional 40 Positive Positive 19 11 1 Alive
926 64 White Single T2 N1 IIB Moderately differentiated 2 Regional 22 Positive Positive 1 1 3 Dead
1039 64 White Divorced T2 N2 IIIA Moderately differentiated 2 Regional 25 Positive Positive 9 4 4 Dead
1153 67 White Married T2 N1 IIB Poorly differentiated 3 Regional 25 Positive Positive 4 1 2 Dead
1698 59 White Single T3 N1 IIIA Moderately differentiated 2 Regional 70 Positive Positive 9 1 4 Dead
1705 63 White Married T2 N2 IIIA Moderately differentiated 2 Regional 35 Positive Positive 21 5 3 Dead
1727 61 Black Widowed T2 N3 IIIC Poorly differentiated 3 Regional 47 Positive Positive 21 21 4 Dead
1747 46 White Divorced T1 N1 IIA Moderately differentiated 2 Regional 19 Positive Positive 26 1 2 Dead
1889 49 White Divorced T2 N1 IIB Moderately differentiated 2 Regional 38 Positive Negative 15 3 4 Dead
2226 47 Other Married T2 N2 IIIA Moderately differentiated 2 Regional 45 Positive Positive 25 9 2 Alive
2406 69 White Married T1 N1 IIA Moderately differentiated 2 Regional 12 Positive Negative 9 1 4 Dead
3085 63 White Married T2 N2 IIIA Moderately differentiated 2 Regional 26 Positive Positive 28 9 4 Dead
3470 58 Black Widowed T1 N2 IIIA Moderately differentiated 2 Regional 2 Positive Negative 11 4 4 Dead
3799 53 White Separated T1 N1 IIA Moderately differentiated 2 Regional 17 Positive Positive 1 1 4 Alive

For Survival Months extreme outliers are not present, and only 18 observations are considered outliers, which correspond to cases that had values smaller than 5 months. Also by taking a closer look at those observations it can be derived that in most of the cases, those outliers with small values of Survival Months variable correspond to Dead cases, thus we retain those observations inside the dataset.

Followig by the Bivariate Exploratory Analysis is taking place. Partof the Bivariate Exploratory Analysis is the section Multivariate Outliers Analysis which is performed with Mahalobis distance.


SECTION 4: Bivariate Exploratory Analysis¶

In this section, a deeper look is taken into the relationship between the pairs of available variables with respect to the target variable Status.

Firstly, the relation between the numerical variables with respect to the target variable is depicted in the following figure.

In [ ]:
bivariate_numerical_exploratory_analysis(breast_cancer, 'Status')

Before stating the colcusions of the generated figure, let's take a look at the Corellation Heatmap below as well.

In [ ]:
correlation_heatmap(breast_cancer)

We see no strong correlation between numerical variables, some insights we can get, for example, are the following:

  • Considering the correlation between Age and Tumor Size, we can see that they are indipendent as the value indicated is -0.077.
  • The correlation between Regional Node Examined and Regional Node Positive indicates a moderate positive correlation of 0.412, thus, as the number of regional nodes examined increases, the number of positive nodes also tends to increase.
  • The correlation between Survival Months and other variables is relatively weak. This indicates a weak or no linear relationship between survival months and the other numerical variables in the dataset.

However, by taking a look at the scatter plot of the numerical variables (before the correlation matrix), one can understand that the observations of Alive and Dead classes, are mixed when a combination of two numerical variables occurs. By that it is meant, that there is not clear separation between a pair of numerical variables for distinguishing Alive from Dead cases. The only interesting insight from the scatterplot occures for the variable Survival Months, which seems to generate a good separation boundary between the two classes, for all the combinations of the remaining numerical variables. This insight leads to the conclusion that the variable Survival Months can offer quite strong predictability power to the model.

To continue with, in the next plot, the distribution of the categorical variables with respect to the target varible is presented.

In [ ]:
plot_categorical_variables(breast_cancer, "Status")

Before conlcuding about the categorical variables, it is necessary to statistically test if correlation exist between them by using the Chi-Squared test.

In [ ]:
chi_squared_results = perform_chi_squared_test(breast_cancer, cat_cols)
print(chi_squared_results)
chi_squared_results.to_csv("./chi-2.csv")
    Variable 1           Variable 2  Chi-square       P-value
0         Race       Marital Status  137.649912  7.308896e-26
1         Race             T Stage     8.309501  2.162940e-01
2         Race              N Stage    6.215014  1.836560e-01
3         Race            6th Stage    8.923983  3.487492e-01
4         Race        differentiate   27.855377  1.000418e-04
..         ...                  ...         ...           ...
105     Status        differentiate  111.262966  5.868421e-24
106     Status                Grade  111.262966  5.868421e-24
107     Status              A Stage   35.795768  2.191234e-09
108     Status      Estrogen Status  135.274057  2.876016e-31
109     Status  Progesterone Status  125.062415  4.931880e-29

[110 rows x 4 columns]

By considering the barplots above, there is not clear conclusions to be made, due to the unbalanced nature of the dataset as mentioned earlier. However, some interesting insights generated by this figure is that for the variables T Stage, N Stage and 6th Stage, it can be observed that for the categories T4, N3 and IIC, respectively, the difference between the Alive and Dead cases, tend to get smaller, which might mean that these specific categories my help the model recognise patterns for Dead cases. Finally, it can be observed that in some of them, the probability of survival increases, for example, when the patient has a T1 value for the T Stage variable.

In additoin, by taking a look at the produced csv file, the following conclusions can be derived:

  1. Race and Marital Status: There is a significant association between race and marital status (Chi-square = 137.96, p < 0.001).
  2. Race and other variables: Race does not show a significant association with T Stage, N Stage, 6th Stage, A Stage, or Progesterone Status. However, it is significantly associated with differentiation, grade, estrogen status, and overall status (p < 0.05).
  3. Marital Status and other variables: Marital status is significantly associated with T Stage, N Stage, differentiation, grade, and overall status (p < 0.05).
  4. T Stage, N Stage, and other variables: T Stage and N Stage are strongly associated with each other (Chi-square = 323.41, p < 0.001) and show significant associations with differentiation, grade, 6th Stage, A Stage, estrogen status, progesterone status, and overall status (p < 0.05).
  5. 6th Stage and other variables: 6th Stage is strongly associated with T Stage, N Stage, differentiation, grade, A Stage, estrogen status, progesterone status, and overall status (p < 0.001).
  6. Differentiation and other variables: Differentiation is significantly associated with race, Marital Status, T Stage, N Stage, 6th Stage, grade, A Stage, estrogen status, progesterone status, and overall status (p < 0.05).
  7. Grade and other variables: Grade is significantly associated with race, Marital Status, T Stage, N Stage, 6th Stage, differentiation, A Stage, estrogen status, progesterone status, and overall status (p < 0.05).
  8. A Stage and other variables: A Stage shows a significant association with differentiation, estrogen status, and overall status (p < 0.05).
  9. Estrogen Status and other variables: Estrogen status is significantly associated with race, T Stage, N Stage, 6th Stage, differentiation, grade, A Stage, progesterone status, and overall status (p < 0.05).
  10. Progesterone Status and other variables: Progesterone status is significantly associated with race, Marital Status, T Stage, N Stage, 6th Stage, differentiation, grade, estrogen status, and overall status (p < 0.05).
  11. Overall Status and other variables: Overall status is significantly associated with race, Marital Status, T Stage, N Stage, 6th Stage, differentiation, grade, A Stage, estrogen status, and progesterone status (p < 0.05).

Multivariate Outliers Analysis¶

In addition, though, we'll perform a multivariate outlier detection using Mahalanobis Distance o the numerical variables of the dataset. In order to do so the calculation of the covariance matrix and its inverse is necessary in order to calculate the distances between the observations of the dataset.

In [ ]:
df_mahalanobis = breast_cancer[num_cols]
df_mahalanobis;
In [ ]:
df_mahalanobis = df_mahalanobis.to_numpy()

#Covariance matrix
cov_matrix = np.cov(df_mahalanobis, rowvar=False)
cov_matrix_pm1 = np.linalg.matrix_power(cov_matrix, -1)
centerpoint = np.mean(df_mahalanobis, axis=0)

The cutoff value for detecting outliers is set to 1% based on the Chi-Square distribution. Thus, 0.99 represents the desired significance level.

In [ ]:
# Distances between center point 
distances = []
for i, val in enumerate(df_mahalanobis):
      p1 = val
      p2 = centerpoint
      distance = (p1-p2).T.dot(cov_matrix_pm1).dot(p1-p2)
      distances.append(distance)
distances = np.array(distances)

# Cutoff (threshold) value from Chi-Sqaure Distribution for detecting outliers 
cutoff = chi2.ppf(0.99, df_mahalanobis.shape[1])

# Index of outliers
outlierIndexes = np.where(distances > cutoff )

# print('--- Index of Outliers ----')
# print(outlierIndexes)

print('--- Number of Outliers ----')
print(len(outlierIndexes[0]))

df_multiv_outliers = df_mahalanobis[ distances > cutoff , :]
--- Number of Outliers ----
177
In [ ]:
breast_cancer.iloc[outlierIndexes[0]].describe(include='all')
Out[ ]:
Age Race Marital Status T Stage N Stage 6th Stage differentiate Grade A Stage Tumor Size Estrogen Status Progesterone Status Regional Node Examined Regional Node Positive Survival Months Status
count 177.000000 177 177 177 177 177 177 177 177 177.000000 177 177 177.000000 177.000000 177.000000 177
unique NaN 3 5 4 3 5 4 4 2 NaN 2 2 NaN NaN NaN 2
top NaN White Married T3 N3 IIIC Moderately differentiated 2 Regional NaN Positive Positive NaN NaN NaN Alive
freq NaN 155 101 73 106 106 89 89 159 NaN 152 129 NaN NaN NaN 106
mean 54.254237 NaN NaN NaN NaN NaN NaN NaN NaN 60.474576 NaN NaN 26.960452 15.920904 59.214689 NaN
std 9.893053 NaN NaN NaN NaN NaN NaN NaN NaN 40.255303 NaN NaN 12.472862 11.010556 30.326803 NaN
min 30.000000 NaN NaN NaN NaN NaN NaN NaN NaN 1.000000 NaN NaN 3.000000 1.000000 2.000000 NaN
25% 47.000000 NaN NaN NaN NaN NaN NaN NaN NaN 23.000000 NaN NaN 19.000000 5.000000 37.000000 NaN
50% 55.000000 NaN NaN NaN NaN NaN NaN NaN NaN 50.000000 NaN NaN 26.000000 17.000000 60.000000 NaN
75% 62.000000 NaN NaN NaN NaN NaN NaN NaN NaN 100.000000 NaN NaN 35.000000 25.000000 83.000000 NaN
max 69.000000 NaN NaN NaN NaN NaN NaN NaN NaN 133.000000 NaN NaN 61.000000 46.000000 107.000000 NaN
In [ ]:
## Finding ellipse dimensions 
pearson = cov_matrix[0, 1]/np.sqrt(cov_matrix[0, 0] * cov_matrix[1, 1])
ell_radius_x = np.sqrt(1 + pearson)
ell_radius_y = np.sqrt(1 - pearson)
lambda_, v = np.linalg.eig(cov_matrix)
lambda_ = np.sqrt(lambda_)

# Ellipse patch
ellipse = patches.Ellipse(xy=(centerpoint[0], centerpoint[1]),
                          width=lambda_[0]*np.sqrt(cutoff)*2, height=lambda_[1]*np.sqrt(cutoff)*2,
                          angle=np.rad2deg(np.arccos(v[0, 0])), edgecolor='#9C648E')
ellipse.set_facecolor('#648E9C')  # Set the facecolor to '#9C648E'
ellipse.set_alpha(0.5)

fig = plt.figure()
ax = plt.subplot()
ax.add_artist(ellipse)

# Scatter plot
plt.scatter(df_mahalanobis[:, 0], df_mahalanobis[:, 1], color='#9C648E')  # Set the scatter plot color to '#648E9C'

# Set plot title and labels
plt.title('Outlier Detection')
plt.xlabel('X')
plt.ylabel('Y')

plt.show();

As the above figure presents, based on the Mahalanobis distance and Chi-square distribution with 0.99 significance level, there are 177 multivariate outliers. After carefull consideration, by taking a look at those observations, we consider retaining them inside the dataset, since their maximum values seem logical. Although, we might return to this point in case of failures during the modelling phase.


SECTION 5: Treatment of Mixed Data¶

With the following command it is clear that all categorical variables are set to be of type object.

In [ ]:
breast_cancer.dtypes
Out[ ]:
Age                        int64
Race                      object
Marital Status            object
T Stage                   object
N Stage                   object
6th Stage                 object
differentiate             object
Grade                     object
A Stage                   object
Tumor Size                 int64
Estrogen Status           object
Progesterone Status       object
Regional Node Examined     int64
Regional Node Positive     int64
Survival Months            int64
Status                    object
dtype: object
In [ ]:
print_categorical_variables(breast_cancer)
Categorical Variables:
Race: White, Black, Other
Marital Status: Married, Divorced, Single , Widowed, Separated
T Stage : T1, T2, T3, T4
N Stage: N1, N2, N3
6th Stage: IIA, IIIA, IIIC, IIB, IIIB
differentiate: Poorly differentiated, Moderately differentiated, Well differentiated, Undifferentiated
Grade: 3, 2, 1,  anaplastic; Grade IV
A Stage: Regional, Distant
Estrogen Status: Positive, Negative
Progesterone Status: Positive, Negative
Status: Alive, Dead

For presentation reasons, we are changing the value anaplastic; Grade IV of the variable Grade to 4, and then all the values to latin numbers in order to be identical.

In [ ]:
breast_cancer['Grade'].replace(' anaplastic; Grade IV', '4', inplace=True)
breast_cancer['Grade'].replace({'1': 'I', '2': 'II', '3': 'III', '4': 'IV'}, inplace=True)
In [ ]:
print_categorical_variables(breast_cancer)
Categorical Variables:
Race: White, Black, Other
Marital Status: Married, Divorced, Single , Widowed, Separated
T Stage : T1, T2, T3, T4
N Stage: N1, N2, N3
6th Stage: IIA, IIIA, IIIC, IIB, IIIB
differentiate: Poorly differentiated, Moderately differentiated, Well differentiated, Undifferentiated
Grade: III, II, I, IV
A Stage: Regional, Distant
Estrogen Status: Positive, Negative
Progesterone Status: Positive, Negative
Status: Alive, Dead

Now it is necessary to create an order for the variables T Stage, N Stage, 6th Stage, differentiate, and Grade.

In [ ]:
# Define the custom order for each categorical variable
t_stage_order = ['T1', 'T2', 'T3', 'T4']
n_stage_order = ['N1', 'N2', 'N3']
stage_6_order = ['IIA', 'IIB', 'IIIA', 'IIIB', 'IIIC']
differentiate_order = ['Undifferentiated', 'Poorly differentiated', 'Moderately differentiated', 'Well differentiated']
grade_order = ['I', 'II','III', 'IV']

# Apply the custom order to the categorical variables
breast_cancer['T Stage '] = pd.Categorical(breast_cancer['T Stage '], categories=t_stage_order, ordered=True)
breast_cancer['N Stage'] = pd.Categorical(breast_cancer['N Stage'], categories=n_stage_order, ordered=True)
breast_cancer['6th Stage'] = pd.Categorical(breast_cancer['6th Stage'], categories=stage_6_order, ordered=True)
breast_cancer['differentiate'] = pd.Categorical(breast_cancer['differentiate'], categories=differentiate_order, ordered=True)
breast_cancer['Grade'] = pd.Categorical(breast_cancer['Grade'], categories=grade_order, ordered=True)

# Print the updated order of the categorical variables
print("Categorical Variables with Custom Order:")
print("T Stage:", breast_cancer['T Stage '].cat.categories)
print("N Stage:", breast_cancer['N Stage'].cat.categories)
print("6th Stage:", breast_cancer['6th Stage'].cat.categories)
print("differentiate:", breast_cancer['differentiate'].cat.categories)
print("Grade:", breast_cancer['Grade'].cat.categories)
Categorical Variables with Custom Order:
T Stage: Index(['T1', 'T2', 'T3', 'T4'], dtype='object')
N Stage: Index(['N1', 'N2', 'N3'], dtype='object')
6th Stage: Index(['IIA', 'IIB', 'IIIA', 'IIIB', 'IIIC'], dtype='object')
differentiate: Index(['Undifferentiated', 'Poorly differentiated',
       'Moderately differentiated', 'Well differentiated'],
      dtype='object')
Grade: Index(['I', 'II', 'III', 'IV'], dtype='object')
In [ ]:
import warnings
# Get the summary of categorical data
summary = pd.DataFrame(columns=['Variable', 'Value', 'Count'])

# Iterate over each column
for column in breast_cancer.columns:
    # Check if the column is categorical
    if breast_cancer[column].dtype in ['object', 'category']:
        # Calculate the value counts
        value_counts = breast_cancer[column].value_counts().reset_index()
        value_counts.columns = ['Value', 'Count']

        # Add the variable name to the summary
        value_counts['Variable'] = column

        # Append to the summary DataFrame
        summary = pd.concat([summary, value_counts], ignore_index=True)

# Print the summary
print(summary)
               Variable                      Value Count
0                  Race                      White  3408
1                  Race                      Other   320
2                  Race                      Black   291
3        Marital Status                    Married  2639
4        Marital Status                    Single    615
5        Marital Status                   Divorced   485
6        Marital Status                    Widowed   235
7        Marital Status                  Separated    45
8              T Stage                          T2  1786
9              T Stage                          T1  1603
10             T Stage                          T3   529
11             T Stage                          T4   101
12              N Stage                         N1  2731
13              N Stage                         N2   817
14              N Stage                         N3   471
15            6th Stage                        IIA  1305
16            6th Stage                        IIB  1130
17            6th Stage                       IIIA  1047
18            6th Stage                       IIIC   471
19            6th Stage                       IIIB    66
20        differentiate  Moderately differentiated  2348
21        differentiate      Poorly differentiated  1110
22        differentiate        Well differentiated   542
23        differentiate           Undifferentiated    19
24                Grade                         II  2348
25                Grade                        III  1110
26                Grade                          I   542
27                Grade                         IV    19
28              A Stage                   Regional  3927
29              A Stage                    Distant    92
30      Estrogen Status                   Positive  3750
31      Estrogen Status                   Negative   269
32  Progesterone Status                   Positive  3321
33  Progesterone Status                   Negative   698
34               Status                      Alive  3404
35               Status                       Dead   615

SECTION 6: Feature Extraction¶

From all the variables we have, we wanted to extract some new variables. Precisely, we will extract three categorical and one continuous (numerical):

  • Age Group: We will group the age between 'Young', 'Middle-aged' and 'Elderly'
  • Race Group: As there is a big gap between the number of pacients from different races, we will try to minimize a bit indicating if they are 'Caucasian' or 'Non-caucasian'
  • Tumor Stage Group: We will group T1 into 'Early Stage' and T2, T3 and T4 into 'Advanced Stage'.

The continuous one will be the following:

  • Age at diagnosis: When was the patient diagnosed
  • Positive lymph Ratio: Ratio of positive to examined lymph nodes
In [ ]:
# Create a new column for Age Group based on custom age ranges
breast_cancer['Age Group'] = pd.cut(breast_cancer['Age'], bins=[30, 50, 60, np.inf], labels=['Young', 'Middle-aged', 'Elderly'])

# Print the updated dataframe with the Age Group variable
print(breast_cancer[['Age', 'Age Group']])
print(breast_cancer['Age Group'].value_counts())
      Age    Age Group
0      68      Elderly
1      50        Young
2      58  Middle-aged
3      58  Middle-aged
4      47        Young
...   ...          ...
4014   62      Elderly
4015   56  Middle-aged
4016   68      Elderly
4017   58  Middle-aged
4018   46        Young

[4019 rows x 2 columns]
Age Group
Young          1490
Middle-aged    1385
Elderly        1139
Name: count, dtype: int64

In order to have balanced data in this new variable, the boundaries set were the following:

  • From 30 to 50 --> Young
  • From 50 to 60 --> Middle-aged
  • From 60 to ∞ --> Elderly
In [ ]:
# Create a new column for broader race groups
breast_cancer['Race Group'] = ''

# Assign the broader race group based on Race values
breast_cancer['Race Group'] = np.where(breast_cancer['Race'] == 'White', 'Caucasian', breast_cancer['Race Group'])
breast_cancer['Race Group'] = np.where(breast_cancer['Race'].isin(['Other', 'Black']), 'Non-Caucasian', breast_cancer['Race Group'])

# Print the updated dataframe with the Race and Race Group variables
print(breast_cancer[['Race', 'Race Group']])
print(breast_cancer['Race Group'].value_counts())
       Race     Race Group
0     White      Caucasian
1     White      Caucasian
2     White      Caucasian
3     White      Caucasian
4     White      Caucasian
...     ...            ...
4014  Other  Non-Caucasian
4015  White      Caucasian
4016  White      Caucasian
4017  Black  Non-Caucasian
4018  White      Caucasian

[4019 rows x 2 columns]
Race Group
Caucasian        3408
Non-Caucasian     611
Name: count, dtype: int64

In this second variable we consider splitting between 'Caucasian' and 'Non-caucasian' race. Even though we minimize the gap between races, it is still extense. We keep it for now.

In [ ]:
# Create a new column for Tumor Stage Group
breast_cancer['Tumor Stage Group'] = ''

# Assign the Tumor Stage Group based on T Stage values
breast_cancer['Tumor Stage Group'] = np.where(breast_cancer['T Stage ']=='T1', 'Early Stage', breast_cancer['Tumor Stage Group'])
breast_cancer['Tumor Stage Group'] = np.where(breast_cancer['T Stage '].isin([ 'T2', 'T3', 'T4']), 'Advanced Stage', breast_cancer['Tumor Stage Group'])

# Print the updated dataframe with the Tumor Stage Group variable
print(breast_cancer[['T Stage ', 'Tumor Stage Group']])
print(breast_cancer['Tumor Stage Group'].value_counts())
     T Stage  Tumor Stage Group
0          T1       Early Stage
1          T2    Advanced Stage
2          T3    Advanced Stage
3          T1       Early Stage
4          T2    Advanced Stage
...       ...               ...
4014       T1       Early Stage
4015       T2    Advanced Stage
4016       T2    Advanced Stage
4017       T2    Advanced Stage
4018       T2    Advanced Stage

[4019 rows x 2 columns]
Tumor Stage Group
Advanced Stage    2416
Early Stage       1603
Name: count, dtype: int64
In [ ]:
#Continuous
breast_cancer['Age at Diagnosis'] = breast_cancer['Age'] - (breast_cancer['Survival Months']//12)
In [ ]:
breast_cancer['Positive Lymph Ratio'] = breast_cancer['Regional Node Positive'] / breast_cancer['Regional Node Examined']
In [ ]:
breast_cancer.head(4)
Out[ ]:
Age Race Marital Status T Stage N Stage 6th Stage differentiate Grade A Stage Tumor Size ... Progesterone Status Regional Node Examined Regional Node Positive Survival Months Status Age Group Race Group Tumor Stage Group Age at Diagnosis Positive Lymph Ratio
0 68 White Married T1 N1 IIA Poorly differentiated III Regional 4 ... Positive 24 1 60 Alive Elderly Caucasian Early Stage 63 0.041667
1 50 White Married T2 N2 IIIA Moderately differentiated II Regional 35 ... Positive 14 5 62 Alive Young Caucasian Advanced Stage 45 0.357143
2 58 White Divorced T3 N3 IIIC Moderately differentiated II Regional 63 ... Positive 14 7 75 Alive Middle-aged Caucasian Advanced Stage 52 0.500000
3 58 White Married T1 N1 IIA Poorly differentiated III Regional 18 ... Positive 2 1 84 Alive Middle-aged Caucasian Early Stage 51 0.500000

4 rows × 21 columns

In [ ]:
#generate_cross_tabulations(breast_cancer, 'Status')

Description of Final Dataset¶

In [ ]:
plot_dataframe(breast_cancer,3,7)

One comment here that we would like to point out, is that we tried to normalize the variables Tumor Size and Regional Node Positive but the results of the sharipo test again give negative results. The code is excluded from this final deliverable.

In [ ]:
bivariate_numerical_exploratory_analysis(breast_cancer, 'Status')
In [ ]:
plot_categorical_variables(breast_cancer, "Status")
In [ ]:
correlation_heatmap(breast_cancer)

By following the same logic described during the Univariate and Bivariate analysis, conclusions can be derived for the new generated features.


SECTION 7: Saving the Dataset¶

To prevent an oredered learning from our model, we will shuffle the data. Also, as we do not have a huge dataset and compression is not necessary, we will save the new data into a csv file.

Important Note¶

It's important to note here that, the preprocessing (applying BoxCox transformation to Tumor Size and Positive Lymph Ratio, or One-hot encoding to categorical variables) as well as the normalization of the data with the Min-Max Scaler is taking place in the modelling notebooks, in order to be able to apply the transformations and the normalisation techniques on the training and test datasets separately.

In [ ]:
np.random.seed(666)
breast_cancer_new = breast_cancer.sample(frac=1).reset_index(drop=True)
In [ ]:
breast_cancer_new.to_csv('breast_cancer_new.csv', index=False)
In [ ]:
breast_cancer_read = read_csv("breast_cancer_new.csv", header=0, delimiter=',')
In [ ]:
breast_cancer_read.head(4)
Out[ ]:
Age Race Marital Status T Stage N Stage 6th Stage differentiate Grade A Stage Tumor Size ... Progesterone Status Regional Node Examined Regional Node Positive Survival Months Status Age Group Race Group Tumor Stage Group Age at Diagnosis Positive Lymph Ratio
0 57 White Married T4 N3 IIIC Poorly differentiated III Distant 85 ... Positive 31 18 41 Alive Middle-aged Caucasian Advanced Stage 54 0.580645
1 47 White Married T2 N1 IIB Moderately differentiated II Regional 23 ... Positive 6 5 50 Alive Young Caucasian Advanced Stage 43 0.833333
2 37 White Single T2 N1 IIB Moderately differentiated II Regional 23 ... Positive 17 3 71 Alive Young Caucasian Advanced Stage 32 0.176471
3 45 White Married T1 N1 IIA Poorly differentiated III Regional 20 ... Positive 15 1 97 Alive Young Caucasian Early Stage 37 0.066667

4 rows × 21 columns